Tree
Block
# Load the Hitters data, drop every row that contains a missing value, and
# split the remaining rows into a training set and a test set.
library(ISLR)
library(caTools)  # provides sample.split(), used for the partition below
data(Hitters)
Hitters <- na.omit(Hitters)

# Log-transformed salary.
# logSalary has 263 observations
# NOTE(review): the models in this script are fitted to the raw Salary column,
# not to logSalary -- confirm whether a log response was intended.
logSalary <- log(Hitters$Salary)

# Seed the generator so the random 2/3 train / 1/3 test partition can be
# reproduced. The outputs recorded in this file come from an earlier run and
# may not match a fresh run exactly.
set.seed(1)
split <- sample.split(Hitters$Salary, SplitRatio = 2/3)
Hitters.train <- subset(Hitters, split == TRUE)
Hitters.test <- subset(Hitters, split == FALSE)

library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1.3
# Boosting: fit one gbm model (1000 trees, gaussian loss) for each shrinkage
# value on a log-spaced grid, and record the training and test MSE of each fit.
set.seed(1)
pows <- seq(-10, -0.2, by = 0.1)
lambdas <- 10^pows
length.lambdas <- length(lambdas)
train.errors <- rep(NA_real_, length.lambdas)
test.errors <- rep(NA_real_, length.lambdas)
# seq_len() is safe even if the grid were empty, unlike 1:length.lambdas.
for (i in seq_len(length.lambdas)) {
  boost.hitters <- gbm(Salary ~ ., data = Hitters.train,
                       distribution = "gaussian", n.trees = 1000,
                       shrinkage = lambdas[i])
  train.pred <- predict(boost.hitters, Hitters.train, n.trees = 1000)
  test.pred <- predict(boost.hitters, Hitters.test, n.trees = 1000)
  train.errors[i] <- mean((Hitters.train$Salary - train.pred)^2)
  test.errors[i] <- mean((Hitters.test$Salary - test.pred)^2)
}

# Training and test MSE as a function of the shrinkage value.
plot(lambdas, train.errors, type = "b", xlab = "Shrinkage",
     ylab = "Train MSE", col = "blue", pch = 20)
plot(lambdas, test.errors, type = "b", xlab = "Shrinkage",
     ylab = "Test MSE", col = "red", pch = 20)

# Smallest test MSE over the grid, and the shrinkage value that attains it.
min(test.errors)
## [1] 91826.68
lambdas[which.min(test.errors)]
## [1] 0.01584893
# Baseline: least-squares regression of Salary on all other columns,
# scored by mean squared error on the held-out test set.
lm.fit <- lm(Salary ~ ., data = Hitters.train)
lm.pred <- predict(lm.fit, newdata = Hitters.test)
mean((lm.pred - Hitters.test$Salary)^2)
## [1] 117327.9
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5

# Lasso (alpha = 1): fit on the training design matrix, predict the test set
# at a fixed penalty s = 0.01, and report the test MSE.
set.seed(134)
y <- Hitters.train$Salary
x <- model.matrix(Salary ~ ., data = Hitters.train)
x.test <- model.matrix(Salary ~ ., data = Hitters.test)
lasso.fit <- glmnet(x, y, alpha = 1)
lasso.pred <- predict(lasso.fit, newx = x.test, s = 0.01)
mean((lasso.pred - Hitters.test$Salary)^2)
## [1] 116130.2
Boosting achieves a noticeably lower test MSE (about 91,827) than either linear regression (about 117,328) or the lasso (about 116,130).
# Refit the boosted model at the shrinkage value with the lowest test MSE
# and print the relative influence of each predictor.
summary(
  gbm(
    Salary ~ .,
    data = Hitters.train,
    distribution = "gaussian",
    n.trees = 1000,
    shrinkage = lambdas[which.min(test.errors)]
  )
)
## var rel.inf
## PutOuts PutOuts 11.0107533
## Walks Walks 9.7211337
## CRBI CRBI 9.4791782
## Years Years 8.6766924
## CHmRun CHmRun 7.5141256
## Hits Hits 7.1800953
## RBI RBI 7.1272807
## CRuns CRuns 6.4177745
## CAtBat CAtBat 6.3589182
## CWalks CWalks 6.1439193
## CHits CHits 5.2055833
## Assists Assists 4.0030046
## HmRun HmRun 3.1375608
## Runs Runs 2.7292957
## Errors Errors 1.9814409
## AtBat AtBat 1.6933875
## League League 0.8451764
## Division Division 0.7746796
## NewLeague NewLeague 0.0000000
library(randomForest)  # provides randomForest(); not loaded in the visible script

# Bagging: a random forest that considers every predictor at each split.
# mtry is derived from the data (all columns except the response Salary)
# instead of being hard-coded, so it stays correct if columns are added or
# removed. Test-set MSE is reported for comparison with the other models.
set.seed(21)
rf.hitters <- randomForest(Salary ~ ., data = Hitters.train, ntree = 500,
                           mtry = ncol(Hitters.train) - 1)
rf.pred <- predict(rf.hitters, newdata = Hitters.test)
mean((Hitters.test$Salary - rf.pred)^2)
## [1] 87754.97
| Team member | Conceptual | Applied | Contribution % |
|---|---|---|---|
| Nehemya | Yes | Yes | 100% |
| Total | | | 100% |